The NVIDIA System Management Interface (nvidia-smi) is a command-line utility, built on top of the NVIDIA Management Library (NVML), intended to aid in the management and monitoring of NVIDIA GPU devices.
!nvidia-smi
# install dependencies:
# !pip install cython pyyaml==5.1
# !pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
# import torch, torchvision
# print(torch.__version__, torch.cuda.is_available())
# !gcc --version
!pip install pyyaml==5.1 'pycocotools>=2.0.1'
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())
!gcc --version
# opencv is pre-installed on colab
assert torch.__version__.startswith("1.6")
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.6/index.html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import random
import cv2
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.engine import DefaultTrainer
from detectron2.data import DatasetCatalog, MetadataCatalog
import os
import json
from detectron2.structures import BoxMode
from google.colab import drive
drive.mount('/content/gdrive')
train_df = pd.read_csv('/content/gdrive/My Drive/Global Wheat Detection/train.csv')
train_path = "/content/gdrive/My Drive/Global Wheat Detection/train"
train_df.head()
# train_df['width'].unique()
# train_df['height'].unique()
for g in train_df.groupby('image_id'):
    b = g[1]['bbox'].values
    print(type(b), b)
    break
Grouping all the bounding boxes by image name:
total_data = []
for g in tqdm_notebook(train_df.groupby('image_id')):
    data = {}
    data['filename'] = g[0]
    data['bbox'] = g[1]['bbox'].values
    total_data.append(data)
In total there are 3373 images
len(total_data)
For standard tasks, we load the original dataset into list[dict] with a specification similar to COCO’s json annotations. This is our standard representation for a dataset; the fields we use are listed below, followed by an illustrative example record.
file_name: the full path to the image file. Rotation or flipping may be applied if the image has EXIF metadata.
height, width: integer. The shape of the image.
image_id (str or int): a unique id that identifies this image. Required by evaluation to identify the images, but a dataset may use it for different purposes.
bbox (list[float]): list of 4 numbers representing the bounding box of the instance.
bbox_mode (int): the format of bbox. It must be a member of structures.BoxMode. Currently supports: BoxMode.XYXY_ABS, BoxMode.XYWH_ABS.
category_id (int): an integer in the range [0, num_categories-1] representing the category label. The value num_categories is reserved to represent the “background” category, if applicable.
annotations (list[dict]): each dict corresponds to annotations of one instance in this image. Required by instance detection/segmentation or keypoint detection tasks, but can be an empty list.
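For concreteness, a single record in this format might look like the following (hypothetical file name and box coordinates, shown only to illustrate the fields above):

# Illustrative dataset dict for one image (hypothetical values):
example_record = {
    "file_name": "/content/gdrive/My Drive/Global Wheat Detection/train/abc123.jpg",
    "image_id": 0,
    "height": 1024,
    "width": 1024,
    "annotations": [
        {"bbox": [834.0, 222.0, 56.0, 36.0],  # [x, y, w, h] in absolute pixels
         "bbox_mode": BoxMode.XYWH_ABS,
         "category_id": 0},                   # 0 = "wheat", the only class
    ],
}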
def get_wheat_dicts(total_data):
    dataset_dicts = []
    for idx, v in enumerate(total_data):
        record = {}
        filename = os.path.join(train_path, v["filename"] + '.jpg')
        height, width = 1024, 1024
        record["file_name"] = filename
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width
        objs = []
        for b in v['bbox']:
            b = json.loads(b)  # each bbox is stored in the csv as a string like "[x, y, w, h]"
            obj = {
                'bbox': list(b),
                'bbox_mode': BoxMode.XYWH_ABS,
                'category_id': 0,
            }
            objs.append(obj)
        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts
Register a Dataset - DatasetCatalog
The function can do arbitrary things and should return the data in one of the following formats:
Detectron2’s standard dataset dict, described above. This makes it work with many other built-in features in Detectron2, so it’s recommended to use it when it’s sufficient.
Metadata - MetadataCatalog
Metadata is a key-value mapping that contains information that’s shared among the entire dataset, and usually is used to interpret what’s in the dataset, e.g., names of classes, colors of classes, root of files, etc.
If you register a new dataset through DatasetCatalog.register, you may also want to add its corresponding metadata through MetadataCatalog.get(dataset_name).some_key = some_value, to enable any features that need the metadata.
# split data: 90% train, 10% val
index = int(0.9 * len(total_data))
train_data = total_data[:index]
val_data = total_data[index:]
folders = ['train', 'val']
for i, d in enumerate([train_data, val_data]):
    DatasetCatalog.register("wheat_" + folders[i], lambda d=d: get_wheat_dicts(d))
    MetadataCatalog.get("wheat_" + folders[i]).set(thing_classes=["wheat"])
Visualizer that draws data about detection/segmentation on images.
It contains methods like draw_{text,box,circle,line,binary_mask,polygon} that draw primitive objects to images.
visualizer.draw_dataset_dict - Draw annotations/segmentations in Detectron2 Dataset format.
Args:
dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
Returns:
output (VisImage): image object with visualizations.
wheat_metadata = MetadataCatalog.get("wheat_train")
# train_data[0]['bbox'][0]
dataset_dicts = get_wheat_dicts(train_data)
for d in random.sample(dataset_dicts, 3):
    img = cv2.imread(d["file_name"])
    visualizer = Visualizer(img[:, :, ::-1], metadata=wheat_metadata, scale=1)
    vis = visualizer.draw_dataset_dict(d)
    plt.figure(figsize=[10, 20])
    plt.imshow(vis.get_image())  # Visualizer was given RGB, so no second channel flip is needed
    plt.show()
RetinaNet adopts the Feature Pyramid Network (FPN) proposed by Lin, Dollar, et al. (2017) as its backbone, which is in turn built on top of ResNet in a fully convolutional fashion. The fully convolutional nature enables the network to take an image of arbitrary size and output proportionally sized feature maps at multiple levels of the feature pyramid.

RetinaNet Explained and Demystified
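As a rough illustration of that proportional scaling (a sketch for intuition, not part of this notebook's pipeline): RetinaNet uses FPN levels P3 through P7, whose strides are 8 through 128, so each level's feature map shrinks with the input accordingly.

# Sketch: FPN levels P3-P7 have strides 8, 16, 32, 64, 128, so the
# feature map at each level scales proportionally with the input image.
import math

def fpn_feature_sizes(h, w):
    return {f"P{l}": (math.ceil(h / 2 ** l), math.ceil(w / 2 ** l)) for l in range(3, 8)}

print(fpn_feature_sizes(1024, 1024))  # {'P3': (128, 128), ..., 'P7': (8, 8)}
print(fpn_feature_sizes(800, 600))    # proportionally smaller maps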
cfg.merge_from_file - load config values from the given YAML file.
cfg.MODEL.WEIGHTS - path or URL of the model weights to load.
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/retinanet_R_101_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/retinanet_R_101_FPN_3x.yaml")
This is the number of foreground classes; we have only the wheat class.
cfg.MODEL.RETINANET.NUM_CLASSES = 1
Inference class-score threshold: only anchors with score > SCORE_THRESH_TEST are kept at inference time (to improve speed).
cfg.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
cfg.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
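To make these three test-time settings concrete, here is a simplified sketch (assumed behavior, not Detectron2's actual implementation, which applies the candidate cap per FPN level) of how the score threshold, top-k cap, and NMS threshold interact:

# Simplified sketch of RetinaNet test-time filtering: filter by score,
# keep the top-k candidates, then suppress overlapping boxes with NMS.
from torchvision.ops import nms

def filter_predictions(boxes, scores, score_thresh=0.05, topk=1000, nms_thresh=0.5):
    keep = scores > score_thresh               # SCORE_THRESH_TEST
    boxes, scores = boxes[keep], scores[keep]
    if len(scores) > topk:                     # TOPK_CANDIDATES_TEST
        scores, idx = scores.topk(topk)
        boxes = boxes[idx]
    keep = nms(boxes, scores, nms_thresh)      # NMS_THRESH_TEST
    return boxes[keep], scores[keep]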
Weights on (dx, dy, dw, dh) for normalizing RetinaNet anchor regression targets
cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
Loss parameters

cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.3
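Gamma and alpha control the focal loss from the RetinaNet paper, FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t): gamma down-weights easy examples and alpha balances foreground against background. A minimal sketch of the binary form (for reference only, not Detectron2's internal code):

# Minimal sketch of binary focal loss: FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t)
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)              # prob of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)  # class-balance weight
    return (alpha_t * (1 - p_t) ** gamma * ce).mean()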
def retinanet_setup():
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/retinanet_R_101_FPN_3x.yaml"))
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/retinanet_R_101_FPN_3x.yaml")
    cfg.MODEL.RETINANET.NUM_CLASSES = 1
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
    cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
    cfg.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
    cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
    cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
    cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
    cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.3
def faster_rcnn_setup():
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
    # cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
General config for all models in Detectron2:
cfg = get_cfg()
retinanet_setup()
# cfg.MODEL.WEIGHTS = '/content/gdrive/My Drive/Global Wheat Detection/yonatan_checkpoints/outputs/model_final.pth'
cfg.DATASETS.TRAIN = ("wheat_train",)
cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 4
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.GAMMA = 0.05
cfg.SOLVER.MAX_ITER = 30000
cfg.SOLVER.MOMENTUM = 0.9
# Save a checkpoint every this many iterations
cfg.SOLVER.CHECKPOINT_PERIOD = 10000
cfg.TEST.EVAL_PERIOD = 1000
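Note that SOLVER.GAMMA only takes effect at the milestones in SOLVER.STEPS, which are inherited here from the merged yaml (likely (210000, 250000) for the 3x schedule) and exceed MAX_ITER, so the learning rate effectively stays at BASE_LR after warmup. A minimal sketch of the multi-step rule, warmup omitted:

# Sketch of the multi-step LR rule: LR is multiplied by SOLVER.GAMMA at
# each milestone in SOLVER.STEPS (steps shown are the assumed 3x-schedule milestones).
def lr_at_iter(it, base_lr=0.00025, gamma=0.05, steps=(210000, 250000)):
    return base_lr * gamma ** sum(it >= s for s in steps)

print(lr_at_iter(29999))  # 0.00025 -- the milestones are never reached with MAX_ITER=30000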
In this section we use DefaultTrainer to create a trainer. The trainer trains the model according to the config set in the section above.
cfg.OUTPUT_DIR = '/content/gdrive/My Drive/Global Wheat Detection/yonatan_checkpoints/outputs'
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
Load the trained model and set DATASETS.TEST = ("wheat_val",) for inference:
# cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.WEIGHTS = '/content/gdrive/My Drive/Global Wheat Detection/yonatan_checkpoints/outputs/model_final.pth'
# cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
cfg.MODEL.RETINANET.SCORE_THRESH_TEST = 0.5
cfg.DATASETS.TEST = ("wheat_val", )
predictor = DefaultPredictor(cfg)
from detectron2.utils.visualizer import ColorMode
dataset_dicts = get_wheat_dicts(val_data)
# wheat_metadata = MetadataCatalog.get("wheat_val")
for d in random.sample(dataset_dicts, 3):
    im = cv2.imread(d["file_name"])
    outputs = predictor(im)
    # print(outputs)
    v = Visualizer(im[:, :, ::-1],
                   metadata=wheat_metadata,
                   scale=0.8,
                   )
    v = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    plt.figure(figsize=[10, 20])
    plt.imshow(v.get_image())  # Visualizer input was RGB, so its output is already RGB
    plt.show()
This competition is evaluated on the mean average precision at different intersection over union (IoU) thresholds.
To understand mAP, we will first explain precision and recall.
mAP (mean average precision) is the average of AP over classes. In some contexts we compute the AP for each class and average them, while in others AP and mAP mean the same thing; for example, under the COCO context there is no difference between AP and mAP.
Evaluate AR for object proposals, AP for instance detection/segmentation, and AP for keypoint detection outputs using COCO's metrics.
AP (average precision) is a popular metric for measuring the accuracy of object detectors like Faster R-CNN, SSD, etc. It averages the precision over recall values from 0 to 1.
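Since the metric thresholds on IoU, here is a minimal sketch of computing the IoU of two axis-aligned boxes in [x1, y1, x2, y2] format:

# Minimal sketch: IoU of two axis-aligned boxes in [x1, y1, x2, y2] format.
def iou(a, b):
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])   # intersection top-left
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])   # intersection bottom-right
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union > 0 else 0.0

print(iou([0, 0, 100, 100], [50, 50, 150, 150]))  # ~0.1429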

#//AP=46.032 BASE_LR = 0.01 TS=0.5 SOLVER.MOMENTUM=0.9 IMS_PER_BATCH = 8 FOCAL_LOSS_GAMMA = 0.9 FOCAL_LOSS_ALPHA = 0.6 SMOOTH_L1_LOSS_BETA = 0.2//
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_test_loader
evaluator = COCOEvaluator("wheat_val", cfg, False, output_dir="./output/")
val_loader = build_detection_test_loader(cfg, "wheat_val")
inference_on_dataset(predictor.model, val_loader, evaluator)
from pathlib import Path
data_dir = Path('/content/gdrive/My Drive/Global Wheat Detection')  # needed below for sub_path
# train_img_dir = Path(data_dir / 'train')
test_img_dir = Path('/content/gdrive/My Drive/Global Wheat Detection/test')
sub_path = Path(data_dir / 'sample_submission.csv')
sub_df = pd.read_csv(sub_path)
from tqdm import tqdm
def submit():
    for idx, row in tqdm(sub_df.iterrows(), total=len(sub_df)):
        img_path = os.path.join(test_img_dir, row.image_id + '.jpg')
        img = cv2.imread(img_path)
        outputs = predictor(img)['instances']
        boxes = [i.cpu().detach().numpy() for i in outputs.pred_boxes]
        scores = outputs.scores.cpu().detach().numpy()
        list_str = []
        for box, score in zip(boxes, scores):
            # convert XYXY -> XYWH for the PredictionString format
            box[3] -= box[1]
            box[2] -= box[0]
            box = list(map(int, box))
            score = round(score, 4)
            list_str.append(score)
            list_str.extend(box)
        sub_df.loc[idx, 'PredictionString'] = ' '.join(map(str, list_str))
    return sub_df
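Each PredictionString is a space-separated sequence of score x y w h groups, one group per detected box; a single hypothetical detection could read 0.9876 12 34 56 78 (illustrative values). This is why the XYXY boxes returned by the predictor are converted to XYWH in the loop above.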
sub_df = submit()
sub_df.to_csv('submission.csv', index=False)
sub_df